Load Data

# load required packages (quanteda and sentimentr are attached further below)
pacman::p_load(tidyverse, tidytext, magrittr, SnowballC, parallel)

df <- read.csv("data/eventbride_18_04_14.csv", comment.char = "#", stringsAsFactors = FALSE)

Reduce Dataframe

keeps <- c("faz.net", "focus.de",
           "handelsblatt.com","n-tv.de","spiegel.de",
           "stern.de","sueddeutsche.de",
           "tagesschau.de", "welt.de", "zeit.de")

df <- df %>%
  mutate(text = body,
         # extract the news site domain from the 'source' field
         # (note: [A-Za-z], not [A-z], which would also match characters like _ and ^)
         site = str_extract(source, "(?<='uri': ')[A-Za-z][^']*"),
         date = as.Date(date)) %>%
  select(date, title, text, site, url, isDuplicate) %>%
  filter(site %in% keeps) %>%
  mutate(title_text = paste(title, text, sep = " "))
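The source column is assumed here to hold a Python-dict-like string, which is why the lookbehind pulls the domain out of its 'uri' entry. A hypothetical example:

str_extract("{'uri': 'spiegel.de', 'dataType': 'news'}",
            "(?<='uri': ')[A-Za-z][^']*")
#> [1] "spiegel.de"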

df_facebook <- df %>%
  filter(grepl("facebook", title, ignore.case = TRUE))

# Calculate text length (number of words)
df_facebook$text_length <- sapply(gregexpr("\\S+", df_facebook$text), length)
ggplot(df_facebook, aes(text_length, group=site,
                        color=site)) +
  geom_density() +
  labs(x="", title = "Word count", color = "")

Filtering

df_facebook <- df_facebook %>%
  filter(text_length > 100) %>%

  # remove articles that contain daily overviews
  filter(!grepl("Nachrichten am Morgen", title)) %>%
  filter(!grepl("Der Morgen live", title)) %>%
  filter(!grepl("Die Lage am", title)) %>%
  filter(!startsWith(title,"News")) %>%

  # remove articles that only contain video 
  filter(!grepl("Video einbetten Nutzungsbedingungen Embedding Tagesschau", title_text)) %>%
  filter(!grepl("</div>", title_text)) %>%
  
  # remove articles that mostly contain user comments
  filter(!startsWith(text, "1.")) %>%
  
  # remove articles behind a pay-wall
  filter(!grepl("SPIEGEL-Plus-Artikel", text)) 
ggplot(df_facebook, aes(site)) +
  geom_bar(fill = col[3], alpha = 0.7) +   # 'col': colour palette assumed to be defined elsewhere in the notebook
  labs(x = "", title = "Number of Articles") +
  theme(axis.text.x = element_text(angle = 60, size = 10))
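The stemming step below operates on a text_cleaned column that is never constructed in this post. A minimal sketch of the assumed preprocessing (lower-casing, stripping non-letters, removing German stop words; the clean_text helper and the choice of title_text as input are my own assumptions):

# assumed preprocessing step (not shown in the original post)
clean_text <- function(x) {
  x <- tolower(x)
  x <- str_replace_all(x, "[^a-zäöüß ]", " ")   # keep letters only
  sw <- paste0("\\b(", paste(tm::stopwords("german"), collapse = "|"), ")\\b")
  x <- str_replace_all(x, sw, " ")              # drop German stop words
  str_squish(x)                                 # collapse whitespace
}

df_facebook$text_cleaned <- clean_text(df_facebook$title_text)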

stem_text <- function(text, language = "porter", mc.cores = 1) {
  # stem each word in a block of text
  stem_string <- function(str, language) {
    str <- strsplit(x = str, split = "\\s")
    str <- SnowballC::wordStem(unlist(str), language = language)
    str <- paste(str, collapse = " ")
    return(str)
  }

  # stem each text block in turn (mclapply comes from the parallel package)
  x <- mclapply(X = text, FUN = stem_string, language, mc.cores = mc.cores)

  # return stemmed text blocks
  return(unlist(x))
}

df_facebook$text_cleaned <- stem_text(df_facebook$text_cleaned)

Term Frequency

token <- df_facebook %>%
  group_by(site) %>%
  unnest_tokens(word, text_cleaned) %>%
  dplyr::count(site, word, sort = TRUE)  %>%
  bind_tf_idf(word, site, n) %>%
  dplyr::arrange(desc(tf_idf))

token %>%
  arrange(desc(tf)) %>%
  arrange(site) %>%
  top_n(5, tf_idf) %>%   # explicit weight; ties are kept
  knitr::kable(align = "l")
|site |word |n |tf |idf |tf_idf |
|:---|:---|:---|:---|:---|:---|
|faz.net |amerikanischen |43 |0.0042194 |0.3566749 |0.0015050 |
|faz.net |palantir |23 |0.0022569 |0.6931472 |0.0015644 |
|faz.net |procter |10 |0.0009813 |2.3025851 |0.0022594 |
|faz.net |gambl |6 |0.0005888 |2.3025851 |0.0013557 |
|faz.net |hässig |6 |0.0005888 |2.3025851 |0.0013557 |
|focus.de |iri |10 |0.0014618 |2.3025851 |0.0033659 |
|focus.de |malter |10 |0.0014618 |2.3025851 |0.0033659 |
|focus.de |modamani |6 |0.0008771 |2.3025851 |0.0020195 |
|focus.de |darm |5 |0.0007309 |2.3025851 |0.0016829 |
|focus.de |gaffer |5 |0.0007309 |2.3025851 |0.0016829 |
|focus.de |jobcent |5 |0.0007309 |2.3025851 |0.0016829 |
|handelsblatt.com |ap |28 |0.0014968 |2.3025851 |0.0034466 |
|handelsblatt.com |rtr |25 |0.0013365 |2.3025851 |0.0030773 |
|handelsblatt.com |hutter |20 |0.0010692 |1.6094379 |0.0017208 |
|handelsblatt.com |snb |16 |0.0008553 |2.3025851 |0.0019695 |
|handelsblatt.com |ma |14 |0.0007484 |2.3025851 |0.0017233 |
|n-tv.de |skript |8 |0.0014278 |1.6094379 |0.0022980 |
|n-tv.de |io |8 |0.0014278 |1.2039728 |0.0017190 |
|n-tv.de |datenriesen |4 |0.0007139 |2.3025851 |0.0016438 |
|n-tv.de |drastischer |4 |0.0007139 |2.3025851 |0.0016438 |
|n-tv.de |motherboard |4 |0.0007139 |2.3025851 |0.0016438 |
|n-tv.de |nonsen |4 |0.0007139 |2.3025851 |0.0016438 |
|n-tv.de |smilei |4 |0.0007139 |2.3025851 |0.0016438 |
|n-tv.de |synchronisierung |4 |0.0007139 |2.3025851 |0.0016438 |
|spiegel.de |netzwelt |19 |0.0025631 |2.3025851 |0.0059017 |
|spiegel.de |sonnenfeld |17 |0.0022933 |2.3025851 |0.0052804 |
|spiegel.de |anja |8 |0.0010792 |2.3025851 |0.0024849 |
|spiegel.de |kregeloh |6 |0.0008094 |2.3025851 |0.0018637 |
|spiegel.de |regina |6 |0.0008094 |2.3025851 |0.0018637 |
|stern.de |polizei |25 |0.0021901 |1.2039728 |0.0026368 |
|stern.de |skripal |23 |0.0020149 |1.6094379 |0.0032428 |
|stern.de |museum |16 |0.0014017 |2.3025851 |0.0032275 |
|stern.de |malt |13 |0.0011389 |2.3025851 |0.0026223 |
|stern.de |mansholt |13 |0.0011389 |2.3025851 |0.0026223 |
|sueddeutsche.de |mackai |8 |0.0010275 |2.3025851 |0.0023659 |
|sueddeutsche.de |stiftung |7 |0.0008990 |1.6094379 |0.0014470 |
|sueddeutsche.de |lehman |6 |0.0007706 |2.3025851 |0.0017744 |
|sueddeutsche.de |live |5 |0.0006422 |2.3025851 |0.0014787 |
|sueddeutsche.de |matter |5 |0.0006422 |2.3025851 |0.0014787 |
|tagesschau.de |studio |8 |0.0031471 |2.3025851 |0.0072465 |
|tagesschau.de |gebhart |6 |0.0023603 |2.3025851 |0.0054349 |
|tagesschau.de |werbeindustri |4 |0.0015736 |1.2039728 |0.0018945 |
|tagesschau.de |marcu |3 |0.0011802 |2.3025851 |0.0027174 |
|tagesschau.de |schuler |3 |0.0011802 |2.3025851 |0.0027174 |
|welt.de |verimi |21 |0.0012861 |2.3025851 |0.0029613 |
|welt.de |nast |13 |0.0007961 |2.3025851 |0.0018332 |
|welt.de |wambach |11 |0.0006736 |2.3025851 |0.0015511 |
|welt.de |unheimlich |10 |0.0006124 |2.3025851 |0.0014101 |
|welt.de |eigentum |8 |0.0004899 |2.3025851 |0.0011281 |
|zeit.de |garcía |9 |0.0008724 |2.3025851 |0.0020088 |
|zeit.de |martínez |9 |0.0008724 |2.3025851 |0.0020088 |
|zeit.de |wenger |8 |0.0007755 |2.3025851 |0.0017856 |
|zeit.de |elbvertiefung |7 |0.0006786 |2.3025851 |0.0015624 |
|zeit.de |picabia |6 |0.0005816 |2.3025851 |0.0013392 |
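With ten outlets in the corpus, bind_tf_idf computes idf = ln(10 / number of outlets a word appears in): a word unique to one outlet gets ln(10) ≈ 2.3026, a word shared by two outlets ln(5) ≈ 1.6094, and a word used by all ten an idf of 0, which is exactly the pattern in the idf column above.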

Bigrams

bigrams <- df_facebook %>%
  unnest_tokens(bigram, text_cleaned, token="ngrams", n=2)

bigrams %>%
  group_by(site) %>%
  count(bigram) %>%
  arrange(desc(n)) %>%
  top_n(5, n) %>%
  knitr::kable(align = "l")
|site |bigram |n |
|:---|:---|:---|
|handelsblatt.com |cambridg analytica |173 |
|welt.de |cambridg analytica |132 |
|zeit.de |cambridg analytica |104 |
|handelsblatt.com |mark zuckerberg |97 |
|faz.net |cambridg analytica |88 |
|sueddeutsche.de |cambridg analytica |70 |
|spiegel.de |cambridg analytica |68 |
|welt.de |mark zuckerberg |68 |
|handelsblatt.com |facebook chef |59 |
|handelsblatt.com |millionen facebook |56 |
|focus.de |cambridg analytica |55 |
|stern.de |cambridg analytica |55 |
|handelsblatt.com |daten millionen |54 |
|n-tv.de |cambridg analytica |47 |
|zeit.de |mark zuckerberg |47 |
|welt.de |facebook chef |42 |
|faz.net |mark zuckerberg |39 |
|welt.de |donald trump |39 |
|stern.de |mark zuckerberg |35 |
|sueddeutsche.de |mark zuckerberg |33 |
|zeit.de |millionen facebook |33 |
|n-tv.de |mark zuckerberg |32 |
|welt.de |daten millionen |32 |
|zeit.de |daten millionen |32 |
|faz.net |sozial netzwerk |30 |
|zeit.de |facebook chef |30 |
|sueddeutsche.de |facebook chef |28 |
|faz.net |facebook nutzern |26 |
|faz.net |millionen facebook |26 |
|spiegel.de |mark zuckerberg |26 |
|sueddeutsche.de |daten millionen |26 |
|tagesschau.de |cambridg analytica |26 |
|focus.de |mark zuckerberg |25 |
|n-tv.de |facebook chef |24 |
|focus.de |donald trump |23 |
|n-tv.de |daten millionen |23 |
|stern.de |facebook chef |23 |
|sueddeutsche.de |u kongress |22 |
|stern.de |donald trump |21 |
|stern.de |u kongress |21 |
|focus.de |millionen facebook |20 |
|spiegel.de |facebook chef |19 |
|focus.de |daten millionen |18 |
|focus.de |facebook chef |18 |
|focus.de |facebook nutzern |18 |
|n-tv.de |donald trump |18 |
|spiegel.de |donald trump |18 |
|tagesschau.de |mark zuckerberg |15 |
|tagesschau.de |facebook chef |12 |
|spiegel.de |datenskand facebook |10 |
|spiegel.de |millionen facebook |10 |
|tagesschau.de |daten millionen |10 |
|tagesschau.de |donald trump |9 |
|tagesschau.de |facebook nutzern |9 |
|tagesschau.de |millionen facebook |9 |

Wordcloud

library(quanteda)
all.corpus <- corpus(df_facebook$text_cleaned)
df.corpus <- dfm(all.corpus)

textplot_wordcloud(df.corpus, max.words = 200,  # 'max_words' in newer quanteda versions
                   colors = col)

SentimentR

pacman::p_load(sentimentr)

Load Dictionary

# Load dictionaries (from: http://wortschatz.uni-leipzig.de/de/download)
neg_df <- read_tsv("dict/SentiWS_v1.8c_Negative.txt", col_names = FALSE)
pos_df <- read_tsv("dict/SentiWS_v1.8c_Positive.txt", col_names = FALSE)

sentiment_df <- bind_rows(neg_df,pos_df)
names(sentiment_df) <- c("Wort_POS", "polarity", "Inflektionen")
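Each SentiWS line has the form word|POS, a tab, the polarity value, and a tab-separated, comma-delimited list of inflected forms, roughly along the lines of: Abbau|NN  -0.058  Abbaus,Abbaues,Abbauen,Abbaue. That is why the word is split off at the "|" below.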

sentiment_df %>% 
  mutate(words = str_sub(Wort_POS, 1, regexpr("\\|", .$Wort_POS)-1),
         words = tolower(words)
         #POS = str_sub(Wort_POS, start = regexpr("\\|", .$Wort_POS)+1)
         ) %>%
  select(words, polarity) -> sentiment_df

# manually add the negation word "nicht" (rbind coerces polarity to character here,
# hence the as.numeric() below)
sentiment_df <- rbind(sentiment_df, c("nicht", -0.8))

# convert to a sentimentr polarity key
sentiment_df %>%
  mutate(polarity = as.numeric(polarity)) %>%
  as_key() -> sentiment_df

Apply to Data

We may wish to see the output from sentiment_by() line by line, with positive and negative sentences highlighted. The highlight() function wraps a sentiment_by() output and produces a highlighted HTML file (positive = green; negative = pink). Let's have a look at a random article here.
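A minimal sketch, assuming the custom SentiWS key built above (sentiment_by() passes polarity_dt through to sentiment()):

# highlight one randomly drawn article (opens an HTML file)
set.seed(42)
sample_article <- df_facebook$title_text[sample(nrow(df_facebook), 1)]
sentiment_by(get_sentences(sample_article),
             polarity_dt = sentiment_df) %>%
  highlight()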

Let's apply this to the whole corpus.

sent_df <- df_facebook %>%
  mutate(split = get_sentences(title_text)) %$%
  sentiment(split, polarity_dt = sentiment_df)

# sentiment() returns one row per sentence; average the sentence scores
# to one value per article before joining, so df_facebook keeps one row per article
sent_df <- sent_df %>%
  group_by(element_id) %>%
  summarise(sentiment = mean(sentiment))

df_facebook$element_id <- as.numeric(rownames(df_facebook))
df_facebook <- left_join(df_facebook,
                         sent_df %>% select(element_id, sentiment),
                         by = "element_id")
df_facebook %>%
  group_by(site) %>%
  mutate(ave_sentiment = mean(sentiment)) -> plot_df   # avoid shadowing base::plot

p1 <- plot_df %>%
  ggplot(aes(sentiment, site, text = title)) +   # 'text' feeds the plotly tooltip
  geom_point(color = "blue", alpha = .5, shape = 1) +
  geom_point(aes(ave_sentiment, site), color = "red", size = 2) +
  xlim(c(-0.3, 0.3)) +
  labs(y = "")

plotly::ggplotly(p1)